Lab 6 - Poisson - Answers

Author

Jason Geller

Published

March 29, 2024

  1. To complete this lab:
Code
library(MASS)
library(tidyverse)
library(emmeans)
library(ggeffects)
library(easystats)
library(performance)
library(knitr)
Code
library(tidyverse)

data <- read_delim("https://raw.githubusercontent.com/jgeller112/psy504-advanced-stats/main/slides/Poisson/data/2010.csv")
  1. Conduct the analysis described in the preregistration document
  1. The number of hours per week that a person spends on the Internet (“WWWHR”) will
    be predicted by their vocabulary (“WORDSUM”), age (“AGE”), sex (“SEX”), religiosity
    (“RELITEN”), political orientation (“POLVIEWS”), and how often they work from home
    (“WRKHOME”).
Code
data_pos <- data %>%
  dplyr::select(wwwhr, wordsum, age, sex, reliten, polviews, wrkhome) %>%
  mutate(wwwhr=case_when(
    wwwhr==-1 ~ NA_real_, 
    wwwhr==998 ~ NA_real_, 
    wwwhr==999 ~ NA_real_,
    TRUE ~ wwwhr), 
    wordsum=case_when(
    wordsum==-1 ~ NA_real_,
    wordsum==99 ~ NA_real_, 
    TRUE ~ wordsum), 
    case_when(
    reliten==0 ~ NA_real_, 
    reliten==8 ~ NA_real_, 
    reliten==9 ~ NA_real_, 
    TRUE~reliten), 
    polviews=case_when(
    polviews==0 ~ NA_real_, 
    polviews==8 ~ NA_real_, 
    polviews==9 ~ NA_real_, 
    TRUE ~ polviews), 
    wrkhome=case_when(
    wrkhome==0 ~ NA_real_, 
    wrkhome==8 ~ NA_real_, 
    wrkhome==9 ~ NA_real_, 
    TRUE ~ wrkhome), 
    age = case_when(
    age == 0 ~ NA_real_,
    age == 98 ~ NA_real_,
    age == 99 ~ NA_real_,
    TRUE ~ age))

NANIAR

  • could use the naniar function replace_with_na
Code
library(naniar)

data_pos <- data %>%
  dplyr::select(wwwhr, wordsum, age, sex, reliten, polviews, wrkhome) %>%
replace_with_na(.,
             replace = list(wwwhr = c(-1, 998, 999),
                          wordsum = c(-1, 99),
                          reliten = c(0, 8, 9), 
             polviews = c(0, 8, 9), 
             wrkhome = c(0,8,9), 
             age=c(0, 98, 99)))
  • Recode sex as factor
Code
data_pos <- data_pos %>%
  mutate(sex=as.factor(sex))
  • Recode reliten

    ::: {.cell}

    Code
    data_pos <- data_pos %>%
      mutate(reliten_recode = case_when(
        reliten==3 ~2, 
        reliten==2 ~ 3, 
        TRUE ~ reliten
      ))
    
    data_pos %>%
      dplyr::select(reliten, reliten_recode)

    ::: {.cell-output .cell-output-stdout}

    # A tibble: 2,044 × 2
       reliten reliten_recode
         <dbl>          <dbl>
     1       1              1
     2       4              4
     3       1              1
     4       1              1
     5       1              1
     6       4              4
     7       3              2
     8       1              1
     9       1              1
    10       1              1
    # ℹ 2,034 more rows

    ::: :::

Missingness

Code
library(skimr)
skimr::skim(data_pos)
Name data_pos
Number of rows 2044
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None
Data summary

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 -1: 1153, 1: 891

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
wwwhr 996 0.51 9.79 13.41 0 2 5 14 168 ▇▁▁▁▁
wordsum 657 0.68 6.03 2.07 0 5 6 7 10 ▁▃▇▅▂
age 3 1.00 47.97 17.68 18 33 47 61 89 ▇▇▇▅▃
reliten 99 0.95 2.08 1.08 1 1 2 3 4 ▇▇▁▂▃
polviews 71 0.97 4.08 1.46 1 3 4 5 7 ▃▂▇▃▅
wrkhome 882 0.57 2.26 1.72 1 1 1 4 6 ▇▁▁▂▁
reliten_recode 99 0.95 2.39 1.16 1 1 3 3 4 ▇▂▁▇▃

Fit Poisson

Code
pos_mod <- glm(wwwhr~wordsum+age+sex+reliten_recode+polviews+wrkhome, family=poisson(), data=data_pos)

Checking Assumptions

Performance - check_model

Code
check_model(pos_mod)

Code
outliers_list <- check_outliers(pos_mod) # Find outliers
outliers_list # Show the row index of the outliers
2 outliers detected: cases 72, 363.
- Based on the following method and threshold: cook (0.9).
- For variable: (Whole model).
Code
outliers_list <- as.numeric(outliers_list) # The object is a binary vector...

filtered_data <- as.data.frame(data_pos)[!outliers_list, ] # And can be used to filter a dataframe
  • Refit after outlier exclusion
Code
pos_mod <- glm(wwwhr~wordsum+age+sex+reliten_recode+polviews+wrkhome, family=poisson, data=filtered_data)

model_parameters(pos_mod) %>%
  print_html()
Parameter Coefficient SE 95% CI z p
(Intercept) 1.56 0.09 (1.39, 1.72) 18.19 < .001
wordsum 0.11 7.72e-03 (0.09, 0.12) 13.76 < .001
age -0.02 1.08e-03 (-0.02, -0.02) -16.11 < .001
sex (1) 0.25 0.03 (0.20, 0.30) 9.35 < .001
reliten recode 0.21 0.01 (0.18, 0.23) 16.02 < .001
polviews -0.04 9.75e-03 (-0.06, -0.02) -3.77 < .001
wrkhome 0.08 7.67e-03 (0.07, 0.10) 10.50 < .001

Overdispersion

Code
library(performance)

check_overdispersion(pos_mod)
# Overdispersion test

       dispersion ratio =   14.732
  Pearson's Chi-Squared = 8750.592
                p-value =  < 0.001
  • Our model appears to be over-dispersed. Let’s fit a negative binomial!

  • Let’s check if the negative binomial solved are problem:

Code
library(MASS)

pos_mod_nb <- glm.nb(wwwhr~wordsum+age+sex+reliten_recode+polviews+wrkhome, data=filtered_data)

model_parameters(pos_mod_nb) %>%
  print_html()
Parameter Coefficient SE 95% CI z p
(Intercept) 1.66 0.28 (1.12, 2.21) 6.00 < .001
wordsum 0.11 0.03 (0.05, 0.16) 4.19 < .001
age -0.02 3.50e-03 (-0.02, -0.01) -4.91 < .001
sex (1) 0.16 0.09 (-0.02, 0.34) 1.79 0.073
reliten recode 0.20 0.04 (0.12, 0.27) 4.81 < .001
polviews -0.04 0.03 (-0.10, 0.03) -1.10 0.270
wrkhome 0.06 0.03 (7.36e-03, 0.12) 2.25 0.025

Which one is better?

Code
test_likelihoodratio(pos_mod, pos_mod_nb)
# Likelihood-Ratio-Test (LRT) for Model Comparison (ML-estimator)

Name       |  Model | df | df_diff |    Chi2 |      p
-----------------------------------------------------
pos_mod    |    glm |  7 |         |         |       
pos_mod_nb | negbin |  8 |       1 | 4606.62 | < .001
  • Negative binom appears to be better!
Code
check_zeroinflation(pos_mod_nb)
# Check for zero-inflation

   Observed zeros: 40
  Predicted zeros: 67
            Ratio: 1.68
  • No zero-inflation!
Code
model_parameters(pos_mod_nb) %>%
  print_html()
Parameter Coefficient SE 95% CI z p
(Intercept) 1.66 0.28 (1.12, 2.21) 6.00 < .001
wordsum 0.11 0.03 (0.05, 0.16) 4.19 < .001
age -0.02 3.50e-03 (-0.02, -0.01) -4.91 < .001
sex (1) 0.16 0.09 (-0.02, 0.34) 1.79 0.073
reliten recode 0.20 0.04 (0.12, 0.27) 4.81 < .001
polviews -0.04 0.03 (-0.10, 0.03) -1.10 0.270
wrkhome 0.06 0.03 (7.36e-03, 0.12) 2.25 0.025
Code
model_parameters(pos_mod_nb, exponentiate = TRUE) %>%
  print_html()
Parameter Coefficient SE 95% CI z p
(Intercept) 5.26 1.46 (3.06, 9.09) 6.00 < .001
wordsum 1.11 0.03 (1.06, 1.18) 4.19 < .001
age 0.98 3.44e-03 (0.98, 0.99) -4.91 < .001
sex (1) 1.17 0.11 (0.98, 1.41) 1.79 0.073
reliten recode 1.22 0.05 (1.12, 1.32) 4.81 < .001
polviews 0.96 0.03 (0.90, 1.03) -1.10 0.270
wrkhome 1.06 0.03 (1.01, 1.12) 2.25 0.025

The analysis employed a negative binomial regression model in R to examine the relationship between sex, age, vocabulary, religious and political views, and time spent on the internet per hour. An over-dispersion test was conducted using the performance package in R to determine the appropriateness of the model.

The analysis revealed that each additional point in the vocabulary test was associated with an increase of 11% in the expected count of hours spent on the internet per week, incidence rate ratio (IRR) = 1.11, SE = .01, p < .001. Increases in religiosity and work-from-home frequency were also associated with increased time spent on the internet, IRR = 1.10, SE = .03, p < .001, and IRR = 1.06, SE = .02, p < .001, respectively. Relative to females, males in our dataset spent more hours online, IRR = 1.28, SE = .05, p < .001, d = 1.28. Two variables, age and political orientation, were negatively associated with time spent on the internet. Older respondents and the more politically conservative spent less time online, IRR = .98, SE = .002, p < .001, d = .98, and IRR = .96, SE = .02, p = .01, d = .96, respectively.